{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use OffPAC to Play Acrobot-v1\n",
    "\n",
    "PyTorch version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import sys\n",
    "import logging\n",
    "import imp\n",
    "import itertools\n",
    "\n",
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "import pandas as pd\n",
    "import gym\n",
    "import matplotlib.pyplot as plt\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torch.distributions as distributions\n",
    "torch.manual_seed(0)\n",
    "\n",
    "imp.reload(logging)\n",
    "logging.basicConfig(level=logging.DEBUG,\n",
    "        format='%(asctime)s [%(levelname)s] %(message)s',\n",
    "        stream=sys.stdout, datefmt='%H:%M:%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:35:40 [INFO] env: <AcrobotEnv<Acrobot-v1>>\n",
      "20:35:40 [INFO] action_space: Discrete(3)\n",
      "20:35:40 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)\n",
      "20:35:40 [INFO] reward_range: (-inf, inf)\n",
      "20:35:40 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}\n",
      "20:35:40 [INFO] _max_episode_steps: 500\n",
      "20:35:40 [INFO] _elapsed_steps: None\n"
     ]
    }
   ],
   "source": [
    "env = gym.make('Acrobot-v1')\n",
    "env.seed(0)\n",
    "for key in vars(env):\n",
    "    logging.info('%s: %s', key, vars(env)[key])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class OffPACAgent:\n",
    "    def __init__(self, env):\n",
    "        self.action_n = env.action_space.n\n",
    "        self.gamma = 0.99\n",
    "\n",
    "        self.actor_net = self.build_net(\n",
    "                input_size=env.observation_space.shape[0],\n",
    "                hidden_sizes=[100,],\n",
    "                output_size=env.action_space.n, output_activator=nn.Softmax(1))\n",
    "        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 0.0002)\n",
    "        self.critic_net = self.build_net(\n",
    "                input_size=env.observation_space.shape[0],\n",
    "                hidden_sizes=[100,], output_size=self.action_n)\n",
    "        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.0004)\n",
    "        self.critic_loss = nn.MSELoss()\n",
    "\n",
    "    def build_net(self, input_size, hidden_sizes, output_size,\n",
    "            output_activator=None):\n",
    "        layers = []\n",
    "        for input_size, output_size in zip(\n",
    "                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):\n",
    "            layers.append(nn.Linear(input_size, output_size))\n",
    "            layers.append(nn.ReLU())\n",
    "        layers = layers[:-1]\n",
    "        if output_activator:\n",
    "            layers.append(output_activator)\n",
    "        net = nn.Sequential(*layers)\n",
    "        return net\n",
    "\n",
    "    def reset(self, mode=None):\n",
    "        self.mode = mode\n",
    "        if self.mode == 'train':\n",
    "            self.trajectory = []\n",
    "            self.discount = 1.\n",
    "\n",
    "    def step(self, observation, reward, done):\n",
    "        if self.mode == 'train':\n",
    "            action = np.random.choice(self.action_n)\n",
    "            self.trajectory += [observation, reward, done, action]\n",
    "            if len(self.trajectory) >= 8:\n",
    "                self.learn()\n",
    "            self.discount *= self.gamma\n",
    "        else:\n",
    "            state_tensor = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)\n",
    "            prob_tensor = self.actor_net(state_tensor)\n",
    "            action_tensor = distributions.Categorical(prob_tensor).sample()\n",
    "            action = action_tensor.numpy()[0]\n",
    "        return action\n",
    "\n",
    "    def close(self):\n",
    "        pass\n",
    "\n",
    "    def learn(self):\n",
    "        state, _, _, action, next_state, reward, done, next_action = \\\n",
    "                self.trajectory[-8:]\n",
    "        state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)\n",
    "        next_state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)\n",
    "\n",
    "        # train actor\n",
    "        q_tensor = self.critic_net(state_tensor)[0, action]\n",
    "        pi_tensor = self.actor_net(state_tensor)[0, action]\n",
    "        behavior_prob = 1. / self.action_n\n",
    "        actor_loss_tensor = -self.discount * q_tensor / behavior_prob * pi_tensor\n",
    "        self.actor_optimizer.zero_grad()\n",
    "        actor_loss_tensor.backward()\n",
    "        self.actor_optimizer.step()\n",
    "\n",
    "        # train critic\n",
    "        next_q_tensor = self.critic_net(next_state_tensor)[:, next_action]\n",
    "        target_tensor = reward + (1. - done) * self.gamma * next_q_tensor\n",
    "        pred_tensor = self.critic_net(state_tensor)\n",
    "        critic_loss_tensor = self.critic_loss(pred_tensor, target_tensor)\n",
    "        pi_tensor = self.actor_net(state_tensor)[0, action]\n",
    "        ratio_tensor = pi_tensor / behavior_prob # importance sampling ratio\n",
    "        critic_loss_tensor *= ratio_tensor\n",
    "        self.critic_optimizer.zero_grad()\n",
    "        critic_loss_tensor.backward()\n",
    "        self.critic_optimizer.step()\n",
    "\n",
    "\n",
    "agent = OffPACAgent(env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:35:41 [INFO] ==== train ====\n",
      "20:35:44 [DEBUG] train episode 0: reward = -500.00, steps = 500\n",
      "20:35:46 [DEBUG] train episode 1: reward = -500.00, steps = 500\n",
      "20:35:49 [DEBUG] train episode 2: reward = -500.00, steps = 500\n",
      "20:35:52 [DEBUG] train episode 3: reward = -500.00, steps = 500\n",
      "20:35:55 [DEBUG] train episode 4: reward = -500.00, steps = 500\n",
      "20:35:57 [DEBUG] train episode 5: reward = -500.00, steps = 500\n",
      "20:36:00 [DEBUG] train episode 6: reward = -500.00, steps = 500\n",
      "20:36:03 [DEBUG] train episode 7: reward = -500.00, steps = 500\n",
      "20:36:05 [DEBUG] train episode 8: reward = -500.00, steps = 500\n",
      "20:36:08 [DEBUG] train episode 9: reward = -500.00, steps = 500\n",
      "20:36:11 [DEBUG] train episode 10: reward = -500.00, steps = 500\n",
      "20:36:13 [DEBUG] train episode 11: reward = -500.00, steps = 500\n",
      "20:36:16 [DEBUG] train episode 12: reward = -500.00, steps = 500\n",
      "20:36:18 [DEBUG] train episode 13: reward = -500.00, steps = 500\n",
      "20:36:21 [DEBUG] train episode 14: reward = -500.00, steps = 500\n",
      "20:36:23 [DEBUG] train episode 15: reward = -500.00, steps = 500\n",
      "20:36:26 [DEBUG] train episode 16: reward = -500.00, steps = 500\n",
      "20:36:29 [DEBUG] train episode 17: reward = -500.00, steps = 500\n",
      "20:36:31 [DEBUG] train episode 18: reward = -500.00, steps = 500\n",
      "20:36:33 [DEBUG] train episode 19: reward = -500.00, steps = 500\n",
      "20:36:36 [DEBUG] train episode 20: reward = -500.00, steps = 500\n",
      "20:36:38 [DEBUG] train episode 21: reward = -500.00, steps = 500\n",
      "20:36:41 [DEBUG] train episode 22: reward = -500.00, steps = 500\n",
      "20:36:43 [DEBUG] train episode 23: reward = -500.00, steps = 500\n",
      "20:36:46 [DEBUG] train episode 24: reward = -500.00, steps = 500\n",
      "20:36:48 [DEBUG] train episode 25: reward = -500.00, steps = 500\n",
      "20:36:50 [DEBUG] train episode 26: reward = -500.00, steps = 500\n",
      "20:36:53 [DEBUG] train episode 27: reward = -500.00, steps = 500\n",
      "20:36:55 [DEBUG] train episode 28: reward = -500.00, steps = 500\n",
      "20:36:58 [DEBUG] train episode 29: reward = -500.00, steps = 500\n",
      "20:37:00 [DEBUG] train episode 30: reward = -500.00, steps = 500\n",
      "20:37:02 [DEBUG] train episode 31: reward = -500.00, steps = 500\n",
      "20:37:05 [DEBUG] train episode 32: reward = -500.00, steps = 500\n",
      "20:37:07 [DEBUG] train episode 33: reward = -500.00, steps = 500\n",
      "20:37:10 [DEBUG] train episode 34: reward = -500.00, steps = 500\n",
      "20:37:12 [DEBUG] train episode 35: reward = -500.00, steps = 500\n",
      "20:37:14 [DEBUG] train episode 36: reward = -500.00, steps = 500\n",
      "20:37:17 [DEBUG] train episode 37: reward = -500.00, steps = 500\n",
      "20:37:19 [DEBUG] train episode 38: reward = -500.00, steps = 500\n",
      "20:37:22 [DEBUG] train episode 39: reward = -500.00, steps = 500\n",
      "20:37:24 [DEBUG] train episode 40: reward = -500.00, steps = 500\n",
      "20:37:27 [DEBUG] train episode 41: reward = -500.00, steps = 500\n",
      "20:37:29 [DEBUG] train episode 42: reward = -500.00, steps = 500\n",
      "20:37:32 [DEBUG] train episode 43: reward = -500.00, steps = 500\n",
      "20:37:34 [DEBUG] train episode 44: reward = -500.00, steps = 500\n",
      "20:37:36 [DEBUG] train episode 45: reward = -500.00, steps = 500\n",
      "20:37:39 [DEBUG] train episode 46: reward = -500.00, steps = 500\n",
      "20:37:41 [DEBUG] train episode 47: reward = -500.00, steps = 500\n",
      "20:37:43 [DEBUG] train episode 48: reward = -500.00, steps = 500\n",
      "20:37:46 [DEBUG] train episode 49: reward = -500.00, steps = 500\n",
      "20:37:48 [DEBUG] train episode 50: reward = -500.00, steps = 500\n",
      "20:37:51 [DEBUG] train episode 51: reward = -500.00, steps = 500\n",
      "20:37:53 [DEBUG] train episode 52: reward = -500.00, steps = 500\n",
      "20:37:56 [DEBUG] train episode 53: reward = -500.00, steps = 500\n",
      "20:37:58 [DEBUG] train episode 54: reward = -500.00, steps = 500\n",
      "20:38:00 [DEBUG] train episode 55: reward = -500.00, steps = 500\n",
      "20:38:03 [DEBUG] train episode 56: reward = -500.00, steps = 500\n",
      "20:38:05 [DEBUG] train episode 57: reward = -500.00, steps = 500\n",
      "20:38:08 [DEBUG] train episode 58: reward = -500.00, steps = 500\n",
      "20:38:10 [DEBUG] train episode 59: reward = -500.00, steps = 500\n",
      "20:38:12 [DEBUG] train episode 60: reward = -500.00, steps = 500\n",
      "20:38:14 [DEBUG] train episode 61: reward = -500.00, steps = 500\n",
      "20:38:16 [DEBUG] train episode 62: reward = -500.00, steps = 500\n",
      "20:38:19 [DEBUG] train episode 63: reward = -500.00, steps = 500\n",
      "20:38:21 [DEBUG] train episode 64: reward = -500.00, steps = 500\n",
      "20:38:24 [DEBUG] train episode 65: reward = -500.00, steps = 500\n",
      "20:38:26 [DEBUG] train episode 66: reward = -500.00, steps = 500\n",
      "20:38:28 [DEBUG] train episode 67: reward = -500.00, steps = 500\n",
      "20:38:31 [DEBUG] train episode 68: reward = -500.00, steps = 500\n",
      "20:38:33 [DEBUG] train episode 69: reward = -500.00, steps = 500\n",
      "20:38:36 [DEBUG] train episode 70: reward = -500.00, steps = 500\n",
      "20:38:38 [DEBUG] train episode 71: reward = -500.00, steps = 500\n",
      "20:38:40 [DEBUG] train episode 72: reward = -500.00, steps = 500\n",
      "20:38:43 [DEBUG] train episode 73: reward = -500.00, steps = 500\n",
      "20:38:45 [DEBUG] train episode 74: reward = -298.00, steps = 299\n",
      "20:38:48 [DEBUG] train episode 75: reward = -443.00, steps = 444\n",
      "20:38:50 [DEBUG] train episode 76: reward = -319.00, steps = 320\n",
      "20:38:52 [DEBUG] train episode 77: reward = -178.00, steps = 179\n",
      "20:38:55 [DEBUG] train episode 78: reward = -302.00, steps = 303\n",
      "20:38:57 [DEBUG] train episode 79: reward = -149.00, steps = 150\n",
      "20:38:59 [DEBUG] train episode 80: reward = -156.00, steps = 157\n",
      "20:39:01 [DEBUG] train episode 81: reward = -185.00, steps = 186\n",
      "20:39:04 [DEBUG] train episode 82: reward = -203.00, steps = 204\n",
      "20:39:06 [DEBUG] train episode 83: reward = -165.00, steps = 166\n",
      "20:39:08 [DEBUG] train episode 84: reward = -112.00, steps = 113\n",
      "20:39:10 [DEBUG] train episode 85: reward = -105.00, steps = 106\n",
      "20:39:13 [DEBUG] train episode 86: reward = -187.00, steps = 188\n",
      "20:39:15 [DEBUG] train episode 87: reward = -151.00, steps = 152\n",
      "20:39:17 [DEBUG] train episode 88: reward = -210.00, steps = 211\n",
      "20:39:20 [DEBUG] train episode 89: reward = -263.00, steps = 264\n",
      "20:39:22 [DEBUG] train episode 90: reward = -241.00, steps = 242\n",
      "20:39:24 [DEBUG] train episode 91: reward = -247.00, steps = 248\n",
      "20:39:26 [DEBUG] train episode 92: reward = -208.00, steps = 209\n",
      "20:39:29 [DEBUG] train episode 93: reward = -214.00, steps = 215\n",
      "20:39:31 [DEBUG] train episode 94: reward = -218.00, steps = 219\n",
      "20:39:33 [DEBUG] train episode 95: reward = -162.00, steps = 163\n",
      "20:39:35 [DEBUG] train episode 96: reward = -163.00, steps = 164\n",
      "20:39:37 [DEBUG] train episode 97: reward = -199.00, steps = 200\n",
      "20:39:40 [DEBUG] train episode 98: reward = -210.00, steps = 211\n",
      "20:39:42 [DEBUG] train episode 99: reward = -190.00, steps = 191\n",
      "20:39:44 [DEBUG] train episode 100: reward = -320.00, steps = 321\n",
      "20:39:46 [DEBUG] train episode 101: reward = -137.00, steps = 138\n",
      "20:39:49 [DEBUG] train episode 102: reward = -131.00, steps = 132\n",
      "20:39:51 [DEBUG] train episode 103: reward = -153.00, steps = 154\n",
      "20:39:53 [DEBUG] train episode 104: reward = -136.00, steps = 137\n",
      "20:39:55 [DEBUG] train episode 105: reward = -103.00, steps = 104\n",
      "20:39:58 [DEBUG] train episode 106: reward = -101.00, steps = 102\n",
      "20:40:00 [DEBUG] train episode 107: reward = -124.00, steps = 125\n",
      "20:40:02 [DEBUG] train episode 108: reward = -130.00, steps = 131\n",
      "20:40:04 [DEBUG] train episode 109: reward = -97.00, steps = 98\n",
      "20:40:06 [DEBUG] train episode 110: reward = -118.00, steps = 119\n",
      "20:40:08 [DEBUG] train episode 111: reward = -155.00, steps = 156\n",
      "20:40:11 [DEBUG] train episode 112: reward = -169.00, steps = 170\n",
      "20:40:13 [DEBUG] train episode 113: reward = -127.00, steps = 128\n",
      "20:40:15 [DEBUG] train episode 114: reward = -135.00, steps = 136\n",
      "20:40:17 [DEBUG] train episode 115: reward = -118.00, steps = 119\n",
      "20:40:19 [DEBUG] train episode 116: reward = -117.00, steps = 118\n",
      "20:40:21 [DEBUG] train episode 117: reward = -140.00, steps = 141\n",
      "20:40:23 [DEBUG] train episode 118: reward = -108.00, steps = 109\n",
      "20:40:26 [DEBUG] train episode 119: reward = -89.00, steps = 90\n",
      "20:40:28 [DEBUG] train episode 120: reward = -112.00, steps = 113\n",
      "20:40:30 [DEBUG] train episode 121: reward = -141.00, steps = 142\n",
      "20:40:32 [DEBUG] train episode 122: reward = -114.00, steps = 115\n",
      "20:40:34 [DEBUG] train episode 123: reward = -90.00, steps = 91\n",
      "20:40:34 [INFO] ==== test ====\n",
      "20:40:34 [DEBUG] test episode 0: reward = -135.00, steps = 136\n",
      "20:40:34 [DEBUG] test episode 1: reward = -130.00, steps = 131\n",
      "20:40:35 [DEBUG] test episode 2: reward = -312.00, steps = 313\n",
      "20:40:35 [DEBUG] test episode 3: reward = -106.00, steps = 107\n",
      "20:40:35 [DEBUG] test episode 4: reward = -92.00, steps = 93\n",
      "20:40:35 [DEBUG] test episode 5: reward = -106.00, steps = 107\n",
      "20:40:35 [DEBUG] test episode 6: reward = -119.00, steps = 120\n",
      "20:40:35 [DEBUG] test episode 7: reward = -121.00, steps = 122\n",
      "20:40:35 [DEBUG] test episode 8: reward = -100.00, steps = 101\n",
      "20:40:35 [DEBUG] test episode 9: reward = -102.00, steps = 103\n",
      "20:40:35 [DEBUG] test episode 10: reward = -87.00, steps = 88\n",
      "20:40:35 [DEBUG] test episode 11: reward = -139.00, steps = 140\n",
      "20:40:35 [DEBUG] test episode 12: reward = -82.00, steps = 83\n",
      "20:40:35 [DEBUG] test episode 13: reward = -95.00, steps = 96\n",
      "20:40:36 [DEBUG] test episode 14: reward = -102.00, steps = 103\n",
      "20:40:36 [DEBUG] test episode 15: reward = -82.00, steps = 83\n",
      "20:40:36 [DEBUG] test episode 16: reward = -100.00, steps = 101\n",
      "20:40:36 [DEBUG] test episode 17: reward = -102.00, steps = 103\n",
      "20:40:36 [DEBUG] test episode 18: reward = -113.00, steps = 114\n",
      "20:40:36 [DEBUG] test episode 19: reward = -99.00, steps = 100\n",
      "20:40:36 [DEBUG] test episode 20: reward = -129.00, steps = 130\n",
      "20:40:36 [DEBUG] test episode 21: reward = -125.00, steps = 126\n",
      "20:40:36 [DEBUG] test episode 22: reward = -97.00, steps = 98\n",
      "20:40:36 [DEBUG] test episode 23: reward = -120.00, steps = 121\n",
      "20:40:36 [DEBUG] test episode 24: reward = -171.00, steps = 172\n",
      "20:40:36 [DEBUG] test episode 25: reward = -121.00, steps = 122\n",
      "20:40:37 [DEBUG] test episode 26: reward = -110.00, steps = 111\n",
      "20:40:37 [DEBUG] test episode 27: reward = -205.00, steps = 206\n",
      "20:40:37 [DEBUG] test episode 28: reward = -168.00, steps = 169\n",
      "20:40:37 [DEBUG] test episode 29: reward = -86.00, steps = 87\n",
      "20:40:37 [DEBUG] test episode 30: reward = -131.00, steps = 132\n",
      "20:40:37 [DEBUG] test episode 31: reward = -212.00, steps = 213\n",
      "20:40:37 [DEBUG] test episode 32: reward = -146.00, steps = 147\n",
      "20:40:37 [DEBUG] test episode 33: reward = -97.00, steps = 98\n",
      "20:40:37 [DEBUG] test episode 34: reward = -76.00, steps = 77\n",
      "20:40:38 [DEBUG] test episode 35: reward = -129.00, steps = 130\n",
      "20:40:38 [DEBUG] test episode 36: reward = -97.00, steps = 98\n",
      "20:40:38 [DEBUG] test episode 37: reward = -84.00, steps = 85\n",
      "20:40:38 [DEBUG] test episode 38: reward = -122.00, steps = 123\n",
      "20:40:38 [DEBUG] test episode 39: reward = -106.00, steps = 107\n",
      "20:40:38 [DEBUG] test episode 40: reward = -119.00, steps = 120\n",
      "20:40:38 [DEBUG] test episode 41: reward = -97.00, steps = 98\n",
      "20:40:38 [DEBUG] test episode 42: reward = -94.00, steps = 95\n",
      "20:40:38 [DEBUG] test episode 43: reward = -101.00, steps = 102\n",
      "20:40:38 [DEBUG] test episode 44: reward = -109.00, steps = 110\n",
      "20:40:38 [DEBUG] test episode 45: reward = -89.00, steps = 90\n",
      "20:40:38 [DEBUG] test episode 46: reward = -108.00, steps = 109\n",
      "20:40:38 [DEBUG] test episode 47: reward = -87.00, steps = 88\n",
      "20:40:39 [DEBUG] test episode 48: reward = -142.00, steps = 143\n",
      "20:40:39 [DEBUG] test episode 49: reward = -86.00, steps = 87\n",
      "20:40:39 [DEBUG] test episode 50: reward = -79.00, steps = 80\n",
      "20:40:39 [DEBUG] test episode 51: reward = -78.00, steps = 79\n",
      "20:40:39 [DEBUG] test episode 52: reward = -92.00, steps = 93\n",
      "20:40:39 [DEBUG] test episode 53: reward = -122.00, steps = 123\n",
      "20:40:39 [DEBUG] test episode 54: reward = -105.00, steps = 106\n",
      "20:40:39 [DEBUG] test episode 55: reward = -124.00, steps = 125\n",
      "20:40:39 [DEBUG] test episode 56: reward = -107.00, steps = 108\n",
      "20:40:39 [DEBUG] test episode 57: reward = -101.00, steps = 102\n",
      "20:40:39 [DEBUG] test episode 58: reward = -88.00, steps = 89\n",
      "20:40:39 [DEBUG] test episode 59: reward = -105.00, steps = 106\n",
      "20:40:40 [DEBUG] test episode 60: reward = -370.00, steps = 371\n",
      "20:40:40 [DEBUG] test episode 61: reward = -104.00, steps = 105\n",
      "20:40:40 [DEBUG] test episode 62: reward = -113.00, steps = 114\n",
      "20:40:40 [DEBUG] test episode 63: reward = -188.00, steps = 189\n",
      "20:40:40 [DEBUG] test episode 64: reward = -218.00, steps = 219\n",
      "20:40:40 [DEBUG] test episode 65: reward = -193.00, steps = 194\n",
      "20:40:40 [DEBUG] test episode 66: reward = -101.00, steps = 102\n",
      "20:40:40 [DEBUG] test episode 67: reward = -120.00, steps = 121\n",
      "20:40:40 [DEBUG] test episode 68: reward = -109.00, steps = 110\n",
      "20:40:41 [DEBUG] test episode 69: reward = -113.00, steps = 114\n",
      "20:40:41 [DEBUG] test episode 70: reward = -95.00, steps = 96\n",
      "20:40:41 [DEBUG] test episode 71: reward = -85.00, steps = 86\n",
      "20:40:41 [DEBUG] test episode 72: reward = -100.00, steps = 101\n",
      "20:40:41 [DEBUG] test episode 73: reward = -95.00, steps = 96\n",
      "20:40:41 [DEBUG] test episode 74: reward = -286.00, steps = 287\n",
      "20:40:41 [DEBUG] test episode 75: reward = -118.00, steps = 119\n",
      "20:40:41 [DEBUG] test episode 76: reward = -92.00, steps = 93\n",
      "20:40:41 [DEBUG] test episode 77: reward = -121.00, steps = 122\n",
      "20:40:41 [DEBUG] test episode 78: reward = -97.00, steps = 98\n",
      "20:40:42 [DEBUG] test episode 79: reward = -219.00, steps = 220\n",
      "20:40:42 [DEBUG] test episode 80: reward = -93.00, steps = 94\n",
      "20:40:42 [DEBUG] test episode 81: reward = -99.00, steps = 100\n",
      "20:40:42 [DEBUG] test episode 82: reward = -121.00, steps = 122\n",
      "20:40:42 [DEBUG] test episode 83: reward = -120.00, steps = 121\n",
      "20:40:42 [DEBUG] test episode 84: reward = -112.00, steps = 113\n",
      "20:40:42 [DEBUG] test episode 85: reward = -91.00, steps = 92\n",
      "20:40:42 [DEBUG] test episode 86: reward = -100.00, steps = 101\n",
      "20:40:42 [DEBUG] test episode 87: reward = -104.00, steps = 105\n",
      "20:40:42 [DEBUG] test episode 88: reward = -154.00, steps = 155\n",
      "20:40:42 [DEBUG] test episode 89: reward = -159.00, steps = 160\n",
      "20:40:43 [DEBUG] test episode 90: reward = -126.00, steps = 127\n",
      "20:40:43 [DEBUG] test episode 91: reward = -94.00, steps = 95\n",
      "20:40:43 [DEBUG] test episode 92: reward = -100.00, steps = 101\n",
      "20:40:43 [DEBUG] test episode 93: reward = -106.00, steps = 107\n",
      "20:40:43 [DEBUG] test episode 94: reward = -114.00, steps = 115\n",
      "20:40:43 [DEBUG] test episode 95: reward = -103.00, steps = 104\n",
      "20:40:43 [DEBUG] test episode 96: reward = -89.00, steps = 90\n",
      "20:40:43 [DEBUG] test episode 97: reward = -112.00, steps = 113\n",
      "20:40:43 [DEBUG] test episode 98: reward = -96.00, steps = 97\n",
      "20:40:43 [DEBUG] test episode 99: reward = -101.00, steps = 102\n",
      "20:40:43 [INFO] average episode reward = -120.20 ± 46.76\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD4CAYAAAAEhuazAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deZRcZ3nn8e9T1V3Vu9Sb1pbUkixkywZsubENBmKCwQpJsMMkczyEAwnJOPhAJmEmIXGcQDKJzzAhmSQMMcQDhHUwngCxQ2LWELZ4QTbCq2Rr3yx3t9RS77W+88e9t7qqunqtrm7dqt/nnD7qulVdde9V9dNPPe9z39ecc4iISG2JrPQOiIjI8lPwFxGpQQr+IiI1SMFfRKQGKfiLiNSgupXegfnq6upyvb29K70bIiKh8thjjw0657qLt4cm+Pf29rJ3796V3g0RkVAxs2OltqvsIyJSgxT8RURqkIK/iEgNUvAXEalBCv4iIjVIwV9EpAYp+IuI1CAFfxGRZTYymeK+vSeYa0r98WSaRDpTkX1Q8BcRWWafe/g47/uHJ/jJyQuzPu4Lj57gpR/4BkNjySXfBwV/EZFl9v3nBwB4/NjQrI/bd+I8XS0x2ptjS74PZQV/M/slM3vazLJm1ld03x1mdtDMDpjZTXnbrzazJ/37PmxmVs4+iIiEyUQyw96jXtB//PhcwX+IKzevrsh+lJv5PwW8Bfhe/kYz2wXcClwO7AHuNrOof/dHgduAHf7XnjL3QURkyWSyjj//2n5OnZ+oyPM/evQcyUyWrpb4rJn/2dEEJ85NcOWmizD4O+eedc4dKHHXzcC9zrmEc+4IcBC4xszWA23OuYecN9LxGeCWcvZBRGQpHTgzwt3/doh7Hz1ekef//nMDxKIRfvX6Xk5fmOTMhcmSj9t34jwAV25qr8h+VKrmvxE4kXf7pL9to/998XYRkWVx/Ow4f/H1A3zwwf0l7z80MArAI0fOVeT1f3BwkL7edq6/pAuYufSz78R5ohHjio1tFdmPOad0NrNvAetK3HWnc+7+mX6sxDY3y/aZXvs2vBIRmzdvnmNPRURm5pzjt7+4j/v3nc5t+7VXb6W7NV7wuCD47ztxnslUhob6KEulf3iS/WdGeN+enexa30a8LsLjx4Z400vXT3vsvhPnecnaVppilZl5f87M3zl3o3PuihJfMwV+8DL6TXm3e4DT/vaeEttneu17nHN9zrm+7u5paxGIyDL5+tNneO8X983Zl34x6x9JcP++07zlqo387Vt3A6Wz7kMDYwAk09lc6WWp/ODgIACv3dFNrC7CSzeu4rES+5DNOvadOF+xej9UruzzAHCrmcXNbCvewO6jzrkXgBEzu87v8nk7MNsfERFZYYcHRnnvF/fxlR+fIpHOrvTuLNqhfi+j/8Wre7hx1xpi0UjJAddD/aPs3rwaM3h0iUs/P3h+kI7mGLvWe6Wc3VvaefrU8LQLuQ4PjjEymeaqizX4m9kvmNlJ4JXAP5vZ1wGcc08D9wHPAF8D3u2cC47uduDjeIPAh4AHy9kHEamcRDrDb37hx4wnvV/f4YnUCu/R4gXlnG3dLcTroly+sW1a5p/NOg4PjrJ7czuXrmvjkSNnl3QfHj16juu2dRCJeBXw3ZtXk8xkeerUcMHjcoO9FWrzhPK7fb7inOtxzsWdc2udczfl3XeXc267c26nc+7BvO17/bLRdufce1yYP0eKVLkPPrifp08Pc8uVGwAYngxz8B+jORZlbZtX49+9uZ0nTl4gmfdp5vSFCSZTWbavaeHarR08dmyo4P5yjCfTnBya4NJ1UwO4uzd7nTw/LvojtO/EEC3xOrZ3tyzJa5eiK3xFpKRM1vHZh47xi1f38Au7vaG6CxPpFd6rxTs0MMr2NS0E15VevaWdRDrLMy8M5z3Gq/dv7/aC/2Qqy5OnZp+CYb4O+899yZqpgL6mrYGNqxunjS3sO3Gel/WsIhqp3DWwCv4iUtLZsQTprOPlPatoa/A6TsJc9jk8MFaQSQdZd37dPxgX2NbdzDVbOwAWXfopLmoc9J87P/gDXLa+lQNnRnK3E+kM+18Y4WU9lSv5gIK/iMygfzgBQHdrA22N9UB4yz7jyTSnzk+wvbs5t23dqgY2rGoo6LY5NDDKqsZ6OptjdLbE2bGmhUcOz3/QdzSR5r//0zO85e4fsuv9X+eOLz+Zu+9g/yjRiNHb2VzwMzvXtXJ4cCw36Pv8i6Oks65i/f0BBX8RKWlgxAv+a9ritDX4wT+kmf/hvHJOvqu2tPPjvMzf+3TQnCsN7d7czlMLKPv8009O88kfHiFixsb2Rr717Iu5TwDP94+wpaOJWF1h2L10XRuZrMt9Mnj6tPd6l29YtcCjXBgFfxEpqX/Em3aguyVOa1D2mQxnzT/o9NleVHLZvbmd0xcmeeHCRO5x+X8geruaOTuWZGSen3i+/eyLbFzdyP971yt5x6t6GRhJcHLIe+6D/aPTXh/g0nWtALnSz9Onh2mORdnS0bTAo1wYBX8RKWmq7BOnoT5KvC4S2sz/0MAYEYMtnYUB9eotXt3/kcPnGJ5M0T+SKAjQweOPnR2f8zUmkhl+cHCQN+xai5mx22/TfPz4EKlMlmNnx9lRIvhv7WomFo3kgv8zp4e5bH1brh20UhT8RaSk/pEEqxrrc9MbtDXWh7bmf2hglE0dTcTrCqdquHxDG72dTfzpV5/h3w54c+znZ/5B8D9+rnTwP5G3/YcHB5lMZXn9ZWsA2Lm2laZYlMePDXHs7BjprJs22AtQF41wyZoWnj0zQjbrePaFYS7fUNl6Pyj4i8gMBkYSrMmb96atoY7hkLZ6HuofLdkzXx+N8IlfeQXprON37vsJQMGg8BZ/cPbo2bFpP7v/zDCv+fPv8NmHjwHw7f0v0hKv49qtnYAX1F/es5rHj5+fsdMncOm6Vg6cGebo2THGkpmK1/tBwV9EZtA/MsmatrzgH4LMfzKVyQ2YBjJZx5HBsYKgnm97dwsfe9vVOBx1EWNTXq29JV5HV0uM4yXKPs+/6AX0D/7Ls5w6P8G3nu3np17SXTCge/WWdp55YZgn/OUaZ7po69L1rbw4nOCHh7y20l3K/EVkpfSPJFjT2pC73dZQf1HX/NOZLO/63GP8/P/+Af3DU3Pknz4/QSKdnfVq2Vdu7+Qjb93Nb/70DuqjhWFxS2dzycw/GMjNOMevfPJRBkYS3LhrTcFjdm9ZTSbruH/faTasaqA5XnqGzp3+Vb9ffvwkdRFjx9rKXdkbUPAXkWmcc37wL878L86yj3OOP/knr26fdfCjo4W9+zC906fYTZev47du3DFt+5aOppIDvqfOj7O6qZ7feeNOnu8fJWJww0sKg/9V/kIsp85PzPr6l/kdPz8+fp4da1unjU1UgoK/iEwzPJEmmc4WzHXv1fwvzsz/7394lM8+fIx3Xr+VhvoIPzo6dWFWUG/f1lW67DOXLZ3NvHBhkslU4cybJ4cm2Li6kV+9fitXb2nnNTu6py203t4cY5tfbtqxpnXG1+hujdPe5F1LsRyDvaDgLyIl5Hr8p2X+qYtyTv+PffcQr9reyR/+7GVctamdvcemgv/3nx+kt7OJzpb4LM8ws94ubwzgRFHHz6mhCXraG4lGjC/85+v4+Dv6Sv58MI3ETIO9AGaWm/BNwV9EVkx/cHVvUc0/lXFMpi6+Of0nkhl2rmslEjFe0dvOM6eHGU2kGU2keejQWW68bO2in3tzx/Ref+ccp85PsHG1d1+sLjJtrCAQXEswVx1/p1/6Ceb6r7TKrA8mIqGWP7VDYFXe/D6NscrXpBcikc7m6uR9vR1kHew7fp6RyRTJTJYbdy0++PeWaPccGk8xnsywsb1xzp+/5cqNZJ3j6s2zL8R+w85uvvvcAFdsrHybJyj4i0gJQdmncMB3ambPtW0NJX9uJTjnSGayuRbLqzavJmLwo6PnODk0warGevq2zB54Z7O6qZ7WhrqCzP+U3+nTM4/g3xiL8svXbpnzcTfsXMMNO9fM+bilouAvItP0DydorI/SkteamJvc7SLr9U9mvDJU3A/+rQ31XLbeW4XruRdHed3ObupmKMnMh5k3E+exvJr/qfPe9xtXzx38L1aq+YvUiP7hSf7Hg88ylpi7XbN/JMGatnhudktgalrni+wq32ClrVhegH9FbwcPHz7HubFkWSWfwJbOJo7llX1OLiDzv1gp+IvUiB8cHOTvvnuYP7r/qTkf2z8yWVDyAaYWdPEz/8ePD3HP9w4t/Y4uUC74511Z29frlXnqIsZrX9Jd9mts6Wzi5NAEKf9TxsmhCVridblxkDBS8BepEUGXzpcfP8WXHjs562P7RxIFbZ6Qn/l7wf/zDx/nf37tANnsyrZ+JtKFZR+Avi3eKlzXbevMlavKsaWzmUzWcfq8l/F7nT6NBZ+MwkbBX6RGBBcp7Vrfxh/d/xSH/StfSxkomtoBmDan/7GzY2SyjgsrfOFXqcx/3aoGfuVVvdz22m1L8hrB3PqHB73Sz8mhiXl1+lzMFPxFakSQIX/krVcRjRgf+deDJR83mcowMpmelvnH66I01EdywT5ofRwcTVRwr+cWDPgWr5D1x2++fElKPgCXbWgjVhfh3/b3A3BqaDzU9X5Q8BepGUHm39vZzCt6O3j69HDJxwWLuBTX/GFqcreRyRSDo0mA3L8rpdSA71Jra6jnDbvW8sBPTnN2NMHwZDrUnT6g4C9SMxLpLLFohEjEuHRdK4cGRnOBM1+ux79EL38wxUN+z/vZsaXJ/A/2j/L1p89wcmh8QVNIJEqUfSrhF3f3MDSe4vOPHAcIfdlHff4iNWIylckNiu5c10o66zg0MMplRdMJTE3tUCrz9xZ0OTI41fY4OLI0wf8PvvIkjx45l3vtf3z39WyYR3adSHufaCo9E+ZrdnTR1RLjEz84AoS7xx+U+YvUjEQ6Q9xfkjGYRCxYNzZfMBd+yeCfy/y94B8xODu2NGWfMxcmuf6STt5740voH0nwxMnz8/q5UgO+lVAXjXDzlRtzYx497ZVdYL3SFPxFakQilaWh3vuV39bdTH3U2F8i+J8cmqChPkJH0fTEMFXzPzI4ztq2OB3N8SWr+Q+MJLhsXRu/9pqtABwZnHvRdJgK/vEKB3+At+zemHutrpbp5ydMVPYRqRGT6amyT300wvbuFvafmT7oe+zcOJs7mkr2sLc11jE8mebY2TF6O5u5MJFakm6fsUSaiVSGrta4v3RivOCK2tnM1O1TCbvWt3HpulZSmWyoe/xBwV+kZniZ/1Rd/LL1bTx8+Oy0xx0/O87mjtILnwSZ/9GzY7z+0rWcPD/O2SUI/sEsot3+nPu9nU0F4wqzWY5un4CZ8Te3XsVY8uKa4mIxVPYRqRGT6UxB8N+5rpUXLkxyYXzqIi3nHMfPjbOls3Q9u62xnnTWMTiapLerma6W+JLU/Af8PyBd/jhDb1fpdXNLyV3hW7884WznutbcAi1hpuAvUiMmU9mCuniweEh+6WdgNMFEKpNbwKRY/lQJvZ1NdDbHl6TbZ7BE5v/icILxeWTYy5n5VxOdLZEqdGE8xQfuf6pg3dlEUeZ/md/xkz/oe9zv3988Y+Y/VSnu7WqmsyXGWDLDRDJT8vHzNZX5x3LPDZRcOL3YcnX7VBudLZEq9PCRs3z6oWMFV/EWZ/5r2+KsaqwvCP5BsN0yj8x/S2dTLlMv90KvwZEEEYPO5iDzD4L/3KWf5RzwrSY6WyJVKMj48zPy4szfzNi5rpUDeWWf4+fGMZv56tVgCuO1bXGaYnV0+u2O5bZ7Dowm6GiOEY14HTRB5j+fds+Eyj6LorMlUoVywT+v7DOZ1+cfuGxdKwfOjOSmZT5+bpwNqxpnvFo2mNY5yMw7g8y/zI6fgZEkXS1TF5UF7Z5H59Hxk0hniNVFQt96udwU/EWqUDB3f/6AqTe9Q2FQ37WhjbFkhkP+9M7Hzo7NONgLUwu6BME/uNDp7BJk/sWziPZ2Ns2r4yeZzhJX1r9gOmMiVSjI+AsHfLPT2iGvv6QLgO8+NwB4mf+swb+xns7mGLu3rAamavQDZWb+gyOJ3PhBYL7tnsl0VvX+RdAZE6lCQdAf92v+2azzMuSizL+nvYlL1rTw3ecGGEukGRxNztjpA96VwQ/d8Xr+Y98mABpjUZpj0bIyf+ccA6OJXI9/YKZ2z/963z7+/odHcrcV/BdHZ0ykCk2VfbzgH3TEFNf8AV63s5tHDp/jwIte189MF3gFiuvrnS3xsrp9RhJpkulsycwfCts9z48n+cqPT+Vm/wTv2JZjXp9qU9YZM7MPmdl+M3vCzL5iZqvz7rvDzA6a2QEzuylv+9Vm9qR/34dNozQiS26yqOwT/NtQYiD3hp1rSGayfPHREwCzln1K6WqJlTW/TzC1Q9DjHwjGFfIHfR85cg7nCgeyEyll/otR7hn7JnCFc+5lwHPAHQBmtgu4Fbgc2APcbWbBu+6jwG3ADv9rT5n7ICJFiss+wSeBUlMg9PW20xSL8o/7TgGwZYZ5fWbS2RIvq+wzdXVv4eIxQeZ/NC/zf+iQNxdRfgtrMqPgvxhlnTHn3Decc0FB7mGgx//+ZuBe51zCOXcEOAhcY2brgTbn3EPOW6rnM8At5eyDiExXHPyDBU9KZf7xuiiv2t5FIp1lVWM9q5rqpz1mNl7mv/jgX3x1b6BUu+e/HxoECgeyk/4KZbIwS3nG3gk86H+/ETiRd99Jf9tG//vi7SKyhIq7fWbL/AFu2OktdL7Qkg9AV0ucc2MJMtn5L72Yr3hGz3xXblrFt/f3M5nKMDCS4LkXvZbUieLgr8x/weY8Y2b2LTN7qsTXzXmPuRNIA58PNpV4KjfL9ple+zYz22tmewcGBubaVRHxFff5z5b5Q17wn2Owt5TO5hhZ5w3GLsbgaIJoxGhvmr44yjtfvZXB0QRfevwkD/nTT2/vbi6s+WemdzHJ3Oacz985d+Ns95vZO4CfA17vplZdPglsyntYD3Da395TYvtMr30PcA9AX1/f4tIKkRo0MUPNP396h3w97U287brNvPqS7gW/VnCV7+BoMvf9QgyMJOhsjhGJTM8NX7mtk5dvWs3fffcw123roLWhjr4tHXx7/4u5xyRSGWIllpyU2ZXb7bMH+D3gzc65/Ek4HgBuNbO4mW3FG9h91Dn3AjBiZtf5XT5vB+4vZx9EZLrEDN0+s815/2e3vJQ9V6xb8Gt1LWCKh/ypJAKDo8lpV/cGzIzbf2obx8+N86XHT3Ht1k5aGuo04LsEyj1jHwFagW+a2T4z+xiAc+5p4D7gGeBrwLudc8H/1u3Ax/EGgQ8xNU4gIkukuM8/mPxsprJPOYIpHgbnWNTlYP8IN/319/jAA08zVSTwMv+uWT4xvHHXOrZ1N5PJOl61vZPG+igTqUzuOTS9w+KU2+1ziXNuk3PuSv/rXXn33eWc2+6c2+mcezBv+17n3BX+fe9x+e8CEVkSE0UTu+X6/Cuw2lVQ6hmYY1GXg/3eYO1nHz7GJ394NLd9sMS8PvkiEeM9r7uEiMFrX9JNYyxK1k1duKYB38XRGr4iVah4SufcUocVyPzbm+qpj9qcwf/4Oa8y/NqXdPNn//wMPe2NvHHXWgZHZ8/8Ad6yu4dX7+hiTWsD33vOO4bJpDfQqyt8F0dnTKQKTb/Iq3KZv5nR3RKnf2Ry1scdPzfOqsZ6/u5tV/OyntXc/rnH+MtvPEcq42bN/ANrWr2LwBpjXvAPPtXoCt/F0RkTqUJBzb+47FOplsju1vg8Mv8JNnc00RiL8vlfv5Y9V6zjI985CEyNG8xHY31h8NeA7+LojIlUmUzWkcxkiUaMZDpLJuumyj4VyPwBulsb6B+ePfifzJsuuiVex9++dTd/+LOX0dUS44qNq+b9WkG76kQyQybryGQdsaj6/BdKwV+kygQXdAUXTY0n07nWz0rVxte0xWed0z+TdZwcmqCnY2p5SDPj11+zjb1/+Aa2d7fM+7Wmyj5pLd5eBp0xkSoTDPJ2NHtz9EykMt5CLhVc6nBNa5xzY8lcMC724vAkyUx2UdNHFMuVfZLZ3OtpwHfhdMZEqsykHxCDzH8imWEylZnx6t6lEAzGzjS1c9Dps6TBP5XJfcpR5r9wOmMiVSYY3J0q+2SYTFW2HXKN363TP8Og75IG/5h3HMEnGlDwXwydMZEqE5R92pv9zN/PkCua+bf5wX+4dLvnyXPjRAw2rG4sef9CBMcxmczkLvRS2WfhdMZEqkxQCulszi/7ZCvS4x8Iyj6zZf4bVjdSvwTTMDTFvGtTJ1KZqQFfTe+wYDpjIlUm6PFf7S/KMpH0Mv9KTnvc2RLDbPbgv6m9/JIPFNb8kxVuYa1mOmMiVWaq28ev+acqn/nXRyN0NMUYmOEq3+ACr6UQlHi8P2pB5q8+/4VS8BepMpPpopp/Ml3xzB9mvsp3IplhcDSxqIViSolEjIb6SGHZRzX/BdMZE6kyQdmno2n5av4Aa9oaCso+39nfz1gizYkhr9Nn0xJl/uCVfiaSGZIZtXouls6YSJUJ5rwpKPukM8Qr2O0DXrtnMMXD/jPD/OqnfsS7PvcYhwe8qZyXquwD5Ob014Dv4mlKZ5EqE0zlsKqpHjO/Nl7hPn/wgv/gaIJs1vHokXMAfP/5QZ73F13f1F5+m2egIRYt6PPXgO/C6YyJVJnc9M110Vx5pNJ9/uAF/3TWcW48yY+ODrF+VQO337CdM8OTNMeiuU8iS6GxPspkwYCvQtlCKfMXqTITqQzRiFEfNZpiUcZTy5P5dwe9/sMJHjt6jr7eDn73jTsZHEkwPJla0nmFmmKFZR9d5LVwCv4iVWYylaXBn8StMeZlyJPLkfn7V/nuO3Ge0xcm+Y0t7UQixod+6eVL/loN9VFGE5rVsxw6YyJVZiKVyU173FgfZSSRJpVxFVm8PV8wv8+DT70AQF9ve8Vea6rbR8F/sXTGRKrMZGqqp78xVsf58SRQ+UHRYIqHfz90lpZ4HZeua6vYazXG1O1TLp0xkSqTyOvpb6yPMDSeAqChwtlxYyxKa7yOTNaxe0s70Uhl1g6AvMw/7a1YVqfgv2A6YyJVJr/s05SX+Ve65g/Q7df9X7GlciUf8I4lmK1UWf/i6KyJVJnJVCZX32+MRXOZ/3L0wgd1/77ejoq+TmMsyqRf9lG9f3F01kSqTP6qXY31UTJZB1DxAV/w6v51EePKTasr+jqN9VFSGcd4MqPgv0hq9RSpMhOpLB3NQdlnKuAvR+b/y9du5qrNq3Nlp0oJpnW+MJFSj/8iKfiLVJlEKpM34DsVhJcj8792WyfXbuus+OsEf1wuTKSU+S+SzppIlSko+xRk/tUz531+5q8B38XRWROpMhOpTC44FpR9qihDDv6onR9X2WexdNZEqsxkQZ9/XtmnWjN/Bf9F0VkTqSLOuYJ5fBpjU8N61ZQhN+St41vpFcqqVfW8G0SERDqLc1PBMb/sU1WZf95xKfNfHJ01kSqS8JdwzO/zD1R6GcfllH9cGvBdHJ01kSoSLN6eq/kXDPhWUeZfr8y/XDprIlVkIukF/8aizD9iUB+t3ERry01ln/LprIlUkanMv7DmH6+LLulKWiutsUpbWJeTzppIFZnM1fwLyz7VVO+Hwumplfkvjs6aSBUJyj7FA77V1OkDUBeN5AZ6FfwXR2dNpIpML/t4ff7VWBoJPs3E1e2zKDprIlUkkfKDf11Q6/d+xast84epkpYy/8Up66yZ2Z+a2RNmts/MvmFmG/Luu8PMDprZATO7KW/71Wb2pH/fh62aRqFEVtiEH/yDwBiJGI310aqa1C0QlLSqqYV1OZX7J/NDzrmXOeeuBL4KvB/AzHYBtwKXA3uAu80s+B/6KHAbsMP/2lPmPojUpA99fT/vv/+pgm3FA77gdfxUZ9lHmX85yjprzrnhvJvNgPO/vxm41zmXcM4dAQ4C15jZeqDNOfeQc84BnwFuKWcfRGrV48fOs/foUMG23IBvXeG0DtVY9mlS2acsZS/mYmZ3AW8HLgCv8zdvBB7Oe9hJf1vK/754+0zPfRvepwQ2b95c7q6KVJVUJpsb4A0Et/P74Fc11tMar751m3I1fw34LsqcZ83MvmVmT5X4uhnAOXenc24T8HngPcGPlXgqN8v2kpxz9zjn+pxzfd3d3XMfjUgNSWayubl8AkHZJ7/M86Ffehnv27NzWfdtOTSq7FOWOdMB59yN83yu/wv8M/ABvIx+U959PcBpf3tPie0iskDJdJbJVFHmn8oQr4sUXM17+YZVy71ry6KhvrCjSRam3G6fHXk33wzs979/ALjVzOJmthVvYPdR59wLwIiZXed3+bwduL+cfRCpVclM6eBf6cXTLxbK/MtTbiHwg2a2E8gCx4B3ATjnnjaz+4BngDTwbudc8C69HfgU0Ag86H+JyAJ5Nf/isk9mWRZqvxioz788ZQV/59x/mOW+u4C7SmzfC1xRzuuKiFf2yWQdqUyWen/QcyJvCcdq16iyT1l01kRCKpXxeiXySz+TqUxVtnWWkuvzj9bG8S41BX+RkEr6JZ/JvI6fWgr+QdknXiOfdJaazppISCUzQfAvzvxr49e6SX3+ZdFZEwkh51xe5j8V/CdSmdxMntVuW1cLrfE6OlpiK70roVQb7xKRKpPOTl0bmV/2GU9mCta3rWav3tHFk39y09wPlJKU+YuEUDKvxTN/iofJZO30+Ut5FPxFQiiVKRzkDYynMrlauMhsFPxFQqgg888r+0zUUNlHyqPgLxJCifT0zD+TdSTSWZV9ZF4U/EVCqFTZJ7eKlzJ/mQcFf5EQSuYHf/9TQLCQi2r+Mh8K/iIhlEpPtXoGi7YHwb+xRvr8pTwK/iIhlMwUXtULMJ5KAyr7yPwo+IuEUDI9/SIvlX1kIRT8RUIoWWrANzl9/V6RmSj4i4RQqsQVvur2kYVQ8BcJofzMfyLpfT+uso8sgIK/SAgFff4Ry8v8/eBfK/P5S3kU/EVCKLjCtyVeN9XqmVLmL/On4C8SQkHm39ZYn+v2mSr7qM9f5qbgLxJCwcRurQ31ed0+Xp+/FjSX+dC7RCSEcpl/Q11Bt09jfZRIxKcp4cAAAAs7SURBVFZy1yQkFPxFQijI/IvLPqr3y3wp+IuEUDLjXeHbGq8ruMhLnT4yXwr+IiGUTGeJRSM0xKJT0ztoFS9ZAAV/kRBKZbLUR42Gumiu1VNlH1kIBX+REEqms8TqIjTURwoGfFX2kflS8BcJIS/zj9BQHyWVcWSyjgll/rIACv4iIZSf+YM3s+d4Mq0ZPWXeFPxFQiiR8Qd8/TLPZCrDZCpLY72u7pX5UfAXCaFUkPnXecF/ws/8VfaR+VLwFwmhpF/zj+fKPlnGkxmVfWTeFPxFQiiVCWr+fuafzJBIZ7WQi8ybgr9ICOUu8vKD/fmJJKDpnGX+FPxFQiiZcdTXRWjwZ/A8N+YFf5V9ZL4U/EVCKMj8g2B/fjwFaP1emT8Ff5EQ8mr+liv7KPOXhVLwFwmhXM3fb/UcGlfNXxZGwV8khKamd/B+hYdyZR9d5CXzsyTB38x+x8ycmXXlbbvDzA6a2QEzuylv+9Vm9qR/34fNTMsOiSxQML1D3C/7DKnsIwtUdvA3s03AG4Djedt2AbcClwN7gLvNLHhXfhS4Ddjhf+0pdx9Eak2yKPMPav4q+8h8LUXm/1fA+wCXt+1m4F7nXMI5dwQ4CFxjZuuBNufcQ845B3wGuGUJ9kGkpiTTWeJ1EWLRCGZw3q/5q9tH5qus4G9mbwZOOed+UnTXRuBE3u2T/raN/vfF22d6/tvMbK+Z7R0YGChnV0WqSlDzN/MWdDk3rrKPLMyco0Nm9i1gXYm77gT+AHhjqR8rsc3Nsr0k59w9wD0AfX19Mz5OpJakM1myDmL+BV4N9ZHcgK/KPjJfcwZ/59yNpbab2UuBrcBP/DHbHuBxM7sGL6PflPfwHuC0v72nxHYRmaeUv3h7fTQI/lHAC/5B66fIXBZd9nHOPemcW+Oc63XO9eIF9t3OuTPAA8CtZhY3s614A7uPOudeAEbM7Dq/y+ftwP3lH4ZI7UimvQXbpzL/qP9vhEhEzXMyPxVpCnbOPW1m9wHPAGng3c65jH/37cCngEbgQf9LROYpmfGDf9QL9HH/j0BTTD3+Mn9L9m7xs//823cBd5V43F7giqV6XZFakwv+ftAPBnnV6SMLoSt8RUImVVz28ev86vSRhVDwFwmZIPOfGvANyj4K/jJ/Cv4iIZMb8I0WD/gq+Mv8KfiLhEwu8y/q9lHmLwuh4C8SMkHmH1fZR8qg4C8SMqmizD9ep7KPLJyCv0jIzFTzV+YvC6HgLxIyqRm6fdTnLwuh4C8SMokZpndo1BW+sgAK/iIhE0zsliv71GnAVxZOwV8kZGaa2E1lH1kIBX+RkJmq+XsTu+Xm9lHmLwug4C8SMsWZf9DqqbKPLISCv0jIzDS3j8o+shAK/iIhU9znv6a1AYC1bQ0rtk8SPuoNEwkZb/F2y63atWtDG9/73dexubNphfdMwkSZv0jIJNPZXMknoMAvC6XgLxIyqUw2N9grslh6B4mETDIzPfMXWSi9g0RCJpl2ucFekcXSO0gkZJIq+8gS0DtIJGRS6awyfymb3kEiIZPMZKmvs5XeDQk5BX+RkEkq85cloHeQSMio20eWgt5BIiGTTGvAV8qnd5BIyKQyKvtI+fQOEgkZZf6yFPQOEgmZlGr+sgT0DhIJGWX+shT0DhIJmWTGKfhL2fQOEgmZZDqjAV8pm95BIiGTUuYvS0DvIJGQSforeYmUQ8FfJEQyWUcm64hFtVi7lEfBXyREUhlv8XZN7CblUvAXCZGkH/w14Cvl0jtIJESSaT/4a8BXyqR3kEiIpJT5yxIp6x1kZn9sZqfMbJ//9aa8++4ws4NmdsDMbsrbfrWZPenf92EzU/FSZJ6CzF/TO0i5luId9FfOuSv9r38BMLNdwK3A5cAe4G4zC9oTPgrcBuzwv/YswT6I1IRc5q+yj5SprkLPezNwr3MuARwxs4PANWZ2FGhzzj0EYGafAW4BHqzQfvDrn/4Rx86OV+rpRZZVQpm/LJGlCP7vMbO3A3uB/+acGwI2Ag/nPeakvy3lf1+8vSQzuw3vUwKbN29e1M5t7mhWliRVpa+3nVf0tq/0bkjIzRn8zexbwLoSd92JV8L5U8D5//4l8E6gVB3fzbK9JOfcPcA9AH19fTM+bjbv//ldi/kxEZGqNmfwd87dOJ8nMrP/A3zVv3kS2JR3dw9w2t/eU2K7iIgso3K7fdbn3fwF4Cn/+weAW80sbmZb8QZ2H3XOvQCMmNl1fpfP24H7y9kHERFZuHJr/n9uZlfilW6OAr8B4Jx72szuA54B0sC7nXMZ/2duBz4FNOIN9FZssFdEREoz5xZVSl92fX19bu/evSu9GyIioWJmjznn+oq3qw1GRKQGKfiLiNQgBX8RkRqk4C8iUoNCM+BrZgPAsUX+eBcwuIS7sxKq4RigOo6jGo4BquM4quEYoLLHscU51128MTTBvxxmtrfUaHeYVMMxQHUcRzUcA1THcVTDMcDKHIfKPiIiNUjBX0SkBtVK8L9npXdgCVTDMUB1HEc1HANUx3FUwzHAChxHTdT8RUSkUK1k/iIikkfBX0SkBlV18DezPf4C8gfN7PdXen/my8w2mdl3zOxZM3vazH7L395hZt80s+f9fy/65ZzMLGpmPzazr/q3w3gMq83sH8xsv/9/8sqwHYeZvdd/Lz1lZl8ws4YwHIOZfdLM+s3sqbxtM+63md3h/74fMLObVmavC81wDB/y309PmNlXzGx13n3LcgxVG/z9BeP/FvgZYBfwn/yF5cMgjbck5mXAdcC7/X3/feDbzrkdwLf92xe73wKezbsdxmP4G+BrzrlLgZfjHU9ojsPMNgL/Behzzl0BRIFbCccxfArYU7St5H77vyO3Apf7P3O3HwdW2qeYfgzfBK5wzr0MeA64A5b3GKo2+APXAAedc4edc0ngXryF5S96zrkXnHOP+9+P4AWbjXj7/2n/YZ8GblmZPZwfM+sBfhb4eN7msB1DG/Ba4BMAzrmkc+48ITsOvLU7Gs2sDmjCW0Hvoj8G59z3gHNFm2fa75uBe51zCefcEeAgXhxYUaWOwTn3Dedc2r/5MFMrHC7bMVRz8N8InMi7Peti8RcrM+sFrgIeAdb6q6Hh/7tm5fZsXv4aeB+QzdsWtmPYBgwAf++Xrz5uZs2E6Dicc6eAvwCOAy8AF5xz3yBEx1Bkpv0O6+/8O5la1GrZjqGag/+CFou/GJlZC/Al4Ledc8MrvT8LYWY/B/Q75x5b6X0pUx2wG/ioc+4qYIyLszwyI78mfjOwFdgANJvZ21Z2ryoidL/zZnYnXpn388GmEg+ryDFUc/CfaRH5UDCzerzA/3nn3Jf9zS8G6yb7//av1P7Nw/XAm83sKF7J7afN7HOE6xjAex+ddM494t/+B7w/BmE6jhuBI865AedcCvgy8CrCdQz5ZtrvUP3Om9k7gJ8DftlNXXC1bMdQzcH/R8AOM9tqZjG8QZQHVnif5sVf3P4TwLPOuf+Vd9cDwDv8798B3L/c+zZfzrk7nHM9zrlevHP/r865txGiYwBwzp0BTpjZTn/T6/HWpg7TcRwHrjOzJv+99Xq8caQwHUO+mfb7AeBWM4ub2VZgB/DoCuzfnMxsD/B7wJudc+N5dy3fMTjnqvYLeBPeSPoh4M6V3p8F7Per8T7qPQHs87/eBHTidTc87//bsdL7Os/juQH4qv996I4BuBLY6/9//CPQHrbjAP4E2A88BXwWiIfhGIAv4I1TpPCy4l+bbb+BO/3f9wPAz6z0/s9yDAfxavvB7/fHlvsYNL2DiEgNquayj4iIzEDBX0SkBin4i4jUIAV/EZEapOAvIlKDFPxFRGqQgr+ISA36/+q6jxCvItE5AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):\n",
    "    observation, reward, done = env.reset(), 0., False\n",
    "    agent.reset(mode=mode)\n",
    "    episode_reward, elapsed_steps = 0., 0\n",
    "    while True:\n",
    "        action = agent.step(observation, reward, done)\n",
    "        if render:\n",
    "            env.render()\n",
    "        if done:\n",
    "            break\n",
    "        observation, reward, done, _ = env.step(action)\n",
    "        episode_reward += reward\n",
    "        elapsed_steps += 1\n",
    "        if max_episode_steps and elapsed_steps >= max_episode_steps:\n",
    "            break\n",
    "    agent.close()\n",
    "    return episode_reward, elapsed_steps\n",
    "\n",
    "\n",
    "logging.info('==== train ====')\n",
    "episode_rewards = []\n",
    "for episode in itertools.count():\n",
    "    play_episode(env.unwrapped, agent,\n",
    "            max_episode_steps=env._max_episode_steps, mode='train')\n",
    "    episode_reward, elapsed_steps = play_episode(env, agent)\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('train episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "    if np.mean(episode_rewards[-10:]) > -120:\n",
    "        break\n",
    "plt.plot(episode_rewards)\n",
    "\n",
    "\n",
    "logging.info('==== test ====')\n",
    "episode_rewards = []\n",
    "for episode in range(100):\n",
    "    episode_reward, elapsed_steps = play_episode(env, agent)\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('test episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "logging.info('average episode reward = %.2f ± %.2f',\n",
    "        np.mean(episode_rewards), np.std(episode_rewards))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "env.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
